# importing libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import warnings
warnings.filterwarnings('ignore')
sns.set(style='white')
# Load the churn dataset and take a first look at the raw rows.
# NOTE(review): absolute local path — will only run on the author's machine.
data=pd.read_csv("/Users/vipulbhatia29/JupyterAnalytics/storedata/churn_prediction.csv")
data.head(10)
data.describe(include="all")
The describe command shows a summary of the data, as seen below:
# Basic shape/summary checks, then views of the columns grouped by dtype.
data.shape
data.describe()
data[data.dtypes[data.dtypes=="int64"].index]
data[data.dtypes[data.dtypes=="float64"].index]
data[data.dtypes[data.dtypes=="object"].index]
data.dtypes
data.columns
# Cast low-cardinality / identifier-like columns to pandas 'category' dtype.
data["gender"]=data["gender"].astype("category")
data["occupation"]=data["occupation"].astype("category")
data["city"]=data["city"].astype("category")
data["customer_nw_category"]=data["customer_nw_category"].astype("category")
data["branch_code"]=data["branch_code"].astype("category")
#data["churn"]=data["churn"].astype("category")
# -1 is used as a sentinel for missing dependents; the missingness itself
# is analysed later (see the missing-values section).
data["dependents"]=data["dependents"].fillna(-1).astype("int")
data.dtypes
# Derive numeric calendar features from the last-transaction timestamp.
date=pd.DatetimeIndex(data['last_transaction'])
# day of year when last transaction was done
data['doy_ls_tran'] = date.dayofyear
# week of year when last transaction was done
# FIX: DatetimeIndex.weekofyear was deprecated in pandas 1.1 and removed in
# 2.0; isocalendar().week is the supported replacement (same ISO week numbers).
data['woy_ls_tran'] = date.isocalendar().week
# month of year when last transaction was done
data['moy_ls_tran'] = date.month
# day of week when last transaction was done
data['dow_ls_tran'] = date.dayofweek
# drop the raw timestamp now that it is encoded as numeric features
data=data.drop(['last_transaction'],axis=1)
data.dtypes
numerical_columns=data.select_dtypes(include=['int64','float64']).columns
# segregate the continuous variables into groups to break down the analysis
customer_details=['customer_id','vintage','age']
current_month=['current_balance','current_month_credit','current_month_debit','current_month_balance']
previous_month=['previous_month_end_balance','previous_month_credit','previous_month_debit','previous_month_balance']
previous_quarters=['average_monthly_balance_prevQ','average_monthly_balance_prevQ2']
transaction_date = ['doy_ls_tran','woy_ls_tran','moy_ls_tran','dow_ls_tran']
data['churn'].value_counts(normalize=True)
## only 18% of the data consists of customers which have churned
def UVA_numeric(data, feature_set):
    """Univariate analysis of continuous features.

    Draws one KDE plot per feature (falling back to a histogram when the
    KDE fails, e.g. on degenerate data), marks the mean/median and a
    +/- one-std interval, and annotates each subplot with summary stats.

    Parameters
    ----------
    data : pd.DataFrame
        Frame containing the columns listed in `feature_set`.
    feature_set : list of str
        Continuous columns to plot, one subplot per column.
    """
    ncols = len(feature_set)
    fig, axes = plt.subplots(nrows=1, ncols=ncols,
                             figsize=(7 * ncols, 3), dpi=100)
    # FIX: plt.subplots returns a bare Axes (not an array) when ncols == 1,
    # so the original `axes[j]` indexing crashed for a single feature.
    axes = np.atleast_1d(axes)
    for ax, feature in zip(axes, feature_set):
        col = data[feature]  # hoist the repeated column lookup
        max_c = col.max()
        min_c = col.min()
        range_c = max_c - min_c
        mean_c = col.mean()
        median_c = col.median()
        std_c = col.std()
        skew_c = col.skew()
        kurtosis_c = col.kurtosis()
        try:
            # NOTE(review): `shade=` and positional x/y args are deprecated
            # in newer seaborn (use fill=True and keyword args on upgrade).
            sns.kdeplot(col, shade=True, ax=ax)
        except Exception:  # FIX: was a bare except
            # KDE can fail (e.g. constant column); fall back to a histogram.
            ax.hist(col, bins=10)
        sns.scatterplot([mean_c], [0], color='red', label='mean', ax=ax)
        sns.scatterplot([median_c], [0], color='green', label='median', ax=ax)
        sns.lineplot([mean_c + std_c, mean_c - std_c], [0, 0],
                     label='std', color='black', ax=ax)
        ax.set_title('skew: {}; kurtosis: {};\nmean: {}; median: {};\nrange: {}; std_dev: {};'.format(
            round(skew_c, 2), round(kurtosis_c, 2), round(mean_c, 2), round(median_c, 2),
            (round(min_c, 2), round(max_c, 2), round(range_c, 2)), round(std_c, 2)), fontsize=10)
        ax.tick_params(axis='x', labelrotation=45)
        ax.set_xlabel(feature, fontsize=10)
        ax.set_ylabel('probability density', fontsize=10)
    plt.show()
# Univariate plots per feature group.
UVA_numeric(data,customer_details)
Analysis/Insights from Customer_Information:
customer_id:
vintage:
age:
Things to Investigate Further down the road:
UVA_numeric(data,current_month)
Analysis/Insights from current_month
# standard deviation factor
factor = 3
# copying current_month
cm_data = data[current_month]
# filtering using standard deviation (not considering observations > 3 * standard deviation)
cm_data = cm_data[cm_data['current_balance'] < factor*cm_data['current_balance'].std()]
cm_data = cm_data[cm_data['current_month_credit'] < factor*cm_data['current_month_credit'].std()]
cm_data = cm_data[cm_data['current_month_debit'] < factor*cm_data['current_month_debit'].std()]
cm_data = cm_data[cm_data['current_month_balance'] < factor*cm_data['current_month_balance'].std()]
# checking how many points removed
len(data), len(cm_data)
UVA_numeric(cm_data,current_month)
Analysis/Insights from current_month (after removing data > 3 standard deviations towards right of the mean)
Things to investigate further down
UVA_numeric(data,previous_month)
Analysis/Insights from previous_month
UVA_numeric(data,previous_quarters)
Analysis/Insights from Previous Quarters
UVA_numeric(data,transaction_date)
Analysis/Insights from Transaction Date
Day_of_Year:
Week_of_year and Month_of_year: these variables validate the findings from day_of_year.
Things to investigate further Down
#index resetting so indexes can be used as regular columns in dataframe
data[data['churn']==1][['customer_id','churn','gender']].groupby(['churn','gender']).count().reset_index()
#MultiIndex filtering
data[data['churn']==1][['customer_id','churn','gender']].groupby(['churn','gender']).count().loc[[(1,'Male')]]
#Index filtering using like
data[data['churn']==1][['customer_id','churn','gender']].groupby(['churn','gender']).count().filter(like='1' ,axis=0).filter(like='Female',axis=0)
#histogram throws an exception for NaN values, so impute with the mode first
plt.hist(pd.DataFrame(data[data['churn']==1]['occupation'])['occupation'].fillna('self_employed'))
plt.show()
# distplot can be used to look at frequency as well as kdeplots
# total number of churned customers — used to convert bar densities back to counts
val=data[data['churn']==1]['churn'].value_counts()[1]
# NOTE(review): sns.distplot is deprecated (removed in seaborn 0.14);
# prefer sns.histplot / sns.displot when upgrading.
ax=sns.distplot(pd.DataFrame(data[data['churn']==1]['age'])['age'],bins=10)
for p in ax.patches:
    print(p)
    # density * total * bin-width approximates the count in each bin
    ax.text(p.get_x()+p.get_width()/2,p.get_height(),str(round(p.get_height()*val*p.get_width(),2)),ha='center',rotation=90)
#ax.text(p.get_x()+p.get_width()/2,p.get_height(),str(round(p.get_height()*number_of_data_points/100,0)),ha='center')
plt.ylabel("Density")
plt.show()
# Count histogram of churned customers' ages.
# FIX: plt.hist's `normed` argument was deprecated in matplotlib 2.1 and
# removed in 3.1; `density` is the replacement.
ax=plt.hist(pd.DataFrame(data[data['churn']==1]['age'])['age'],bins=10,density=False)
# FIX: with density=False the y-axis shows raw counts, so the original
# "Density" label was wrong.
plt.ylabel("Frequency")
plt.show()
# same per-bin counts in tabular form
data[data['churn']==1]['age'].value_counts(bins=10,normalize=False)
# non-numeric (categorical/object) columns
data.select_dtypes(exclude=['int64','float64','Int64']).dtypes
#segregate the categorical variables into groups to breakdown the analysis
customer_info=['gender','occupation','customer_nw_category','churn']
customer_accounts_info=['city','branch_code']
#churn doesn't need to be grouped
def UVA_Categorical(data, feature_set, annotation=True):
    """Univariate analysis of categorical features.

    Draws one percentage barplot per feature (high-cardinality features
    are truncated to their 10 most frequent categories) and optionally
    annotates each bar with the approximate raw count it represents.

    Parameters
    ----------
    data : pd.DataFrame
        Frame containing the columns listed in `feature_set`.
    feature_set : list of str
        Categorical columns to plot, one subplot per column.
    annotation : bool, default True
        Annotate each bar with its count.
    """
    ncols = len(feature_set)
    fig, axes = plt.subplots(nrows=1, ncols=ncols, figsize=(7 * ncols, 6), dpi=100)
    # FIX: plt.subplots returns a bare Axes when ncols == 1; atleast_1d
    # keeps the indexing uniform.
    axes = np.atleast_1d(axes)
    top_n = 10  # cap on categories shown for high-cardinality features
    for ax, feature in zip(axes, feature_set):
        feature_nunique = data[feature].nunique()
        # Percentage share per category; value_counts sorts descending,
        # and its sum is the non-null count (consistent denominator).
        pct = data[feature].value_counts(normalize=True) * 100
        number_of_data_points = int(data[feature].value_counts().sum())
        if feature_nunique >= top_n:
            pct = pct[:top_n]
        # FIX: the original plotted raw fractions (not *100) in the
        # high-cardinality branch while the other branch used percentages,
        # which made the count annotations below wrong by a factor of 100.
        g = sns.barplot(x=pct.index, y=pct, ax=ax)
        if annotation:
            for p in g.patches:
                # bar height is a percentage; convert back to a count
                g.text(p.get_x() + p.get_width() / 2, p.get_height(),
                       str(round(p.get_height() * number_of_data_points / 100, 0)),
                       ha='center')
        ax.set_title("unique categories: {}".format(feature_nunique))
        ax.set_xlabel(feature, fontsize=15)
        ax.set_ylabel(feature + " distribution in percentage", fontsize=15)
UVA_Categorical(data,customer_info)
Analysis/Insight for Customer Info
Gender:
Occupation
customer_nw_category:
Churn:
Things to investigate:
#Plotting "city": share of accounts held by the 100 most frequent cities
plt.figure(figsize = (20,10), dpi = 120)
n=100
city_count = data['city'].value_counts(normalize=True)
# NOTE(review): positional x/y arguments to sns.barplot are deprecated in
# seaborn 0.12+; pass x=..., y=... keywords when upgrading.
ax=sns.barplot(city_count.sort_values(ascending=False)[:n].index, 100*city_count.sort_values(ascending=False)[:n] , order = city_count.sort_values(ascending=False)[:n].index)
ax.text(30,11,'Number of unique cities:'+ str(data['city'].nunique())+"\nThis will require further analysis to understand\nif the most amount of extreme value transactions are contributed by large cities",fontsize=20)
plt.xlabel('Top 100 Cities')
plt.xticks(rotation=90)
plt.ylabel('Top 100 Cities transaction distribution by percentage')
plt.show()
#Plotting "branch_codes": same breakdown for the 100 most frequent branches
plt.figure(figsize = (20,10), dpi = 120)
n=100
branch_code = data['branch_code'].value_counts(normalize=True)
ax=sns.barplot(branch_code.sort_values(ascending=False)[:n].index, 100*branch_code.sort_values(ascending=False)[:n] , order = branch_code.sort_values(ascending=False)[:n].index)
ax.text(30,0.45,'Number of unique branch codes:'+ str(data['branch_code'].nunique())+"\nThis will require further analysis to understand\nif the most amount of extreme value transactions are contributed by popular branches",fontsize=20)
plt.xlabel('Top '+str(n)+' Branch Codes')
plt.xticks(rotation=90)
plt.ylabel('Top '+str(n)+' Branch Codes transaction distribution by percentage')
plt.show()
Analysis/Insights for Customer Accounts Info: for both variable "city" and "branch_code", there are too many categories.
Things to investigate further Down
# per-column count of missing values
data.isnull().sum()
Things to investigate further down:
gender: Do the customers with missing gender values have some common behaviour in-
occupation:
city:
last_transaction:
For almost all the above:
def UVA_outlier(data, feature_set, include_outlier = True):
    """Box-and-whisker analysis of continuous features.

    Parameters
    ----------
    data : pd.DataFrame
        Frame containing the columns listed in `feature_set`.
    feature_set : list of str
        Continuous columns, one boxplot per column.
    include_outlier : bool, default True
        If True, plot the raw data; if False, plot a copy in which values
        beyond the whiskers are clamped to just outside them so outliers
        no longer dominate the axis.
    """
    data1 = data.copy()
    ncols = len(feature_set)
    fig, axes = plt.subplots(nrows=1, ncols=ncols, figsize=(7 * ncols, 6), dpi=100)
    # FIX: plt.subplots returns a bare Axes when ncols == 1
    axes = np.atleast_1d(axes)
    for ax, feature in zip(axes, feature_set):
        q25 = data1[feature].quantile(0.25)
        q50 = data1[feature].quantile(0.5)
        q75 = data1[feature].quantile(0.75)
        iqr = q75 - q25
        # FIX: Tukey fences extend 1.5*IQR beyond the QUARTILES — which is
        # what seaborn's boxplot draws — not beyond the median as the
        # original computed, so the annotated outlier counts did not match
        # the plotted whiskers.
        whis_low = q25 - 1.5 * iqr
        whis_high = q75 + 1.5 * iqr
        outlier_hval = data1[data1[feature] > whis_high].shape[0]
        outlier_lval = data1[data1[feature] < whis_low].shape[0]
        outlier_count = outlier_hval + outlier_lval
        # clamp values beyond the fences to just outside them (vectorized)
        data1[feature] = np.where(data1[feature] < whis_low, whis_low - 1, data1[feature])
        data1[feature] = np.where(data1[feature] > whis_high, whis_high + 1, data1[feature])
        if include_outlier:
            sns.boxplot(data[feature], ax=ax)
        else:
            sns.boxplot(data1[feature], ax=ax)
        ax.set_title("whiskers: {};\noutlier_counts(high/low/total): {};\n q25_q50_q75: {};\nIQR: {}".format(
            (round(whis_low, 2), round(whis_high, 2)),
            (outlier_hval, outlier_lval, outlier_count),
            (round(q25, 2), round(q50, 2), round(q75, 2)), round(iqr, 0)))
        ax.tick_params(axis='x', labelrotation=90)
# Boxplots for the current/previous month features, with and without outliers.
UVA_outlier(data,current_month,True)
UVA_outlier(data,current_month,False)
UVA_outlier(data, previous_month, include_outlier=True)
UVA_outlier(data, previous_month, include_outlier=False)
Analysis/Insights from Box and Whisker plots for current_month and previous_month:
If we look at corresponding plots in the outputs above, there seems to be a strong relation between the corresponding plots of previous_month and current_month variables.
Outliers are significant in number and very similar in number between corresponding plots, which indicates some inherent undiscovered behaviour of the outliers.
# Boxplots for the quarterly average balances, with and without outliers.
UVA_outlier(data,previous_quarters)
UVA_outlier(data,previous_quarters,False)
Analysis/Insights from Box and Whisker plots for Quarterly Balances:
# Central-limit-theorem demo on the 'vintage' column: the distribution of
# sample means is approximately normal even if the population is not.
population = data['vintage']
population = pd.DataFrame(population)
population.hist(bins=10)
population['vintage'].mean()
# Create a list
sampled_means = []
# For 1000 times,
for i in range(0,1000):
    # Take a random sample of 100 rows from the population, take the mean of those rows, append to sampled_means
    sampled_means.append(population.sample(n=100).mean())
# plotting histogram of the 1000 sample means
plt.figure(dpi = 120)
plt.hist(np.asarray(sampled_means).reshape(np.asarray(sampled_means).shape[0],),bins=100)
plt.xlabel("Vintage")
plt.ylabel('Frequency')
plt.title("Histogram : Mean of samples of Vintage")
plt.show()
plt.figure(dpi = 120)
sns.distplot(np.asarray(sampled_means).reshape(np.asarray(sampled_means).shape[0],),bins=100)
plt.xlabel('vintage')
plt.ylabel('probability density function')
plt.show()
# Standalone demo cell: histogram of a standard-normal sample.
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
# NOTE(review): the line above is an IPython magic — this file is a
# flattened notebook and is not runnable as plain Python.
np.random.seed(42)
x = np.random.normal(size=1000)
print(type(x))
print(x.shape)
plt.hist(x, density=True, bins=30)  # `density=False` would make counts
plt.ylabel('Probability')
plt.xlabel('Data');
# gap between the population mean and the mean of the sampled means
mean_error=population.mean()-np.asarray(sampled_means).reshape(np.asarray(sampled_means).shape[0],).mean()
print(mean_error)
# Fraction of rows with no missing values among the numeric columns.
data[numerical_columns].dropna().shape[0]/data[numerical_columns].shape[0]
print("The percentage of data that has null values in the dataframe is:"+"{:.2%}".format(1-(data[numerical_columns].dropna().shape[0]/data[numerical_columns].shape[0])))
data_non_null=data[numerical_columns].dropna().copy()
#by default the correlation matrix calculation that is used is based on Pearson's Correlation which is based on COV(X,Y)
corr=data_non_null.corr()
corr
# Flatten the correlation matrix into (feature, feature, |corr|) rows.
corr_unstack=corr.abs().unstack().sort_values(kind="quicksort",ascending=True).reset_index()
corr_unstack['corr_val']=corr_unstack[0]
corr_unstack=corr_unstack.drop(columns=0)
# Drop self-correlations (the diagonal) and rank the remaining pairs.
corr_unstack2=corr_unstack[corr_unstack['level_0']!=corr_unstack['level_1']].copy()
corr_unstack2[corr_unstack2['corr_val']>0.0].sort_values(by='corr_val',ascending=False)
# Side-by-side heatmaps of the three correlation methods.
corr_models=['pearson','kendall','spearman']
#for i in enumerate(corr_models):
# print (i)
cmap_j=["Blues","Reds","Greens"]
ncols=len(corr_models)
nrows=int(len(corr_models)/ncols)
fig,axes=plt.subplots(nrows=nrows,ncols=ncols,figsize=(36,8),dpi=140)
counter=0
for i in range(nrows):
    for j in range(ncols):
        ax=axes[j]
        correlation = data_non_null.corr(method=corr_models[counter])
        sns.heatmap(correlation, linewidth=2,annot=False,cmap=cmap_j[j],ax=ax)
        ax.set_title(corr_models[counter],fontsize=20)
        counter=counter+1
#current_month previous_month previous_quarters
# Same three-method correlation heatmaps, restricted to the amount features.
list_amount_features=[]
list_amount_features.extend(current_month)
list_amount_features.extend(previous_month)
list_amount_features.extend(previous_quarters)
corr_models=['pearson','kendall','spearman']
#for i in enumerate(corr_models):
# print (i)
cmap_j=["Blues","Reds","Greens"]
ncols=len(corr_models)
nrows=int(len(corr_models)/ncols)
fig,axes=plt.subplots(nrows=nrows,ncols=ncols,figsize=(36,8),dpi=140)
counter=0
for i in range(nrows):
    for j in range(ncols):
        ax=axes[j]
        correlation = data_non_null[list_amount_features].corr(method=corr_models[counter])
        sns.heatmap(correlation, linewidth=2,annot=False,cmap=cmap_j[j],ax=ax)
        ax.set_title(corr_models[counter],fontsize=20)
        counter=counter+1
Analysis/Insights:
# Pairwise views of the transaction amount features.
transactions = ['current_month_credit','current_month_debit','previous_month_credit','previous_month_debit']
plt.figure(dpi=140)
sns.pairplot(data[transactions])
plt.show()
# the plots above are not meaningful because of the outlier values
# A lot of data is clustered around lower magnitude values. So taking a log of the values will help us visualize the plot better. We can also use StandardScaler from Scikit to standardize the values around standard deviation.
# We will use StandardScaler during feature Engineering
# Since, the logs of negative numbers and zeros are undefined, we will shift the scale of these scalar numbers. this will not affect the correlation. It will just help us plot the logs properly and visualize the correlation
data_including_nulls=data[numerical_columns].copy()
for column in list_amount_features:
    # shift each column so every value is >= 1 before taking the log
    # (note: a +1 shift is applied even to all-positive columns)
    min_add=1
    if data[column].min()<0:
        min_add = abs(data_including_nulls[column].min()) + 1
    data_including_nulls[column]=data_including_nulls[column].map(lambda x:x+min_add)
    data_including_nulls[column]=data_including_nulls[column].map(lambda x:np.log(x))
plt.figure(dpi=140)
sns.pairplot(data_including_nulls[transactions])
plt.show()
Analysis/Insights gathered from Transaction Data
balance = ['previous_month_end_balance','previous_month_balance','current_balance','current_month_balance']
plt.figure(dpi=140)
sns.pairplot(data_including_nulls[balance])
plt.show()
Analysis/Insights
# previous quarters: scatter of the two quarterly averages
plt.figure(dpi=140)
sns.scatterplot(data_including_nulls['average_monthly_balance_prevQ'], data_including_nulls['average_monthly_balance_prevQ2'])
# previous quarters: overlaid KDEs of the two quarterly averages
plt.figure(dpi=140)
p1=sns.kdeplot(data_including_nulls['average_monthly_balance_prevQ'], shade=True, color="r",label='PQ')
p2=sns.kdeplot(data_including_nulls['average_monthly_balance_prevQ2'], shade=True, color="b",label='PQ2')
List of Hypothesis and investigation to perform under this combination.
#from numpy import sqrt, abs, round
from scipy.stats import norm
from scipy.stats import t as t_dist
def BV_compare_features(data, compare, category, category_val):
    """Two-sample mean comparison of `compare` between the rows where
    `data[category] == category_val` and all other rows.

    Draws a barplot of the two group means annotated with z-test and
    t-test p-values, plus a boxplot annotated with the group medians.

    Parameters
    ----------
    data : pd.DataFrame
    compare : str
        Continuous column whose means are compared.
    category : str
        Binary grouping column (e.g. 'churn' with values 0/1).
    category_val :
        Value of `category` that defines the "positive" group.
    """
    temp_data = data.copy()
    x1 = temp_data[compare][temp_data[category] == category_val]
    x0 = temp_data[compare][~(temp_data[category] == category_val)]
    m1, m0 = x1.mean(), x0.mean()
    med1, med0 = x1.median(), x0.median()
    # FIX: the boxplot draws categories in sorted order (0 before 1 for
    # churn), so the negative group's median must come first; the original
    # [med1, med0] swapped the two median labels on the plot.
    # NOTE(review): assumes category values sort with `category_val` last
    # (true for churn 0/1 with category_val=1) — confirm for other uses.
    medians = [med0, med1]
    n1, n0 = x1.shape[0], x0.shape[0]
    std1, std0 = x1.std(), x0.std()
    # standard error of the difference in means (Welch form)
    ovr_sigma = np.sqrt(std1**2 / n1 + std0**2 / n0)
    df = n1 + n0 - 2
    Z_val = T_val = (m1 - m0) / ovr_sigma
    # two-sided p-values; p >= 0.05 means we fail to reject the null
    Z_pval = 2 * (1 - norm.cdf(abs(Z_val)))
    T_pval = 2 * (1 - t_dist.cdf(abs(T_val), df))
    plt.figure(figsize=(15, 6))
    table = pd.pivot_table(data=temp_data, values=compare, columns=category, aggfunc=np.mean)
    # barplot of the group means
    plt.subplot(1, 2, 1)
    sns.barplot(['not {}'.format(category), str(category)], [m0, m1])
    plt.ylabel('mean {}'.format(compare))
    plt.xlabel(category)
    plt.title('t-test p-value = {} \n z-test p-value = {}\n {}'.format(T_pval,
                                                                       Z_pval,
                                                                       table))
    # boxplot with median annotations
    plt.subplot(1, 2, 2)
    box = sns.boxplot(x=category, y=compare, data=temp_data)
    for xtick in box.get_xticks():
        box.text(xtick, medians[xtick] * 1.01, 'median: {}'.format(medians[xtick]),
                 horizontalalignment='center', size='small', color='black', weight='semibold')
    plt.title('categorical boxplot')
Null Hypothesis: M(churn)<M(not_churn)
# compare mean vintage between churners and non-churners
BV_compare_features(data, 'vintage', 'churn', 1)
Analysis/Insight gathered from Vintage and Churn
Hypothesis: Are vintage customers less likely to churn?
Results:
Observation1: P_Val values in both Z_Test and T_Test return 0.42 which is significantly greater than 0.05 which tells us that the alternate hypothesis that Vintage has a significant effect on Churn has been proven wrong.
Observation2: Mean values of the customers that churn and the customers that did not churn are almost the same, which validates the results from the Z_Test and T_Test.
Observation3: Boxplot shows similar distribution with outliers on the lower end.
Conclusion: We can reject the Alternate Hypothesis (vintage customers are less likely to churn).
Null Hypothesis: M(churn)<M(not_churn)
# compare mean previous-quarter average balance between churners and non-churners
BV_compare_features(data, 'average_monthly_balance_prevQ', 'churn', 1)
Analysis/Insight gathered from average_monthly_balance_prevQ and Churn
Hypothesis: Are customers with a high average monthly balance less likely to churn?
Observation1: P_Val values in both Z_Test and T_Test return 0.017 which is significantly lower than 0.05 which tells us that the alternate hypothesis is accepted.
Observation2: Mean values of the customers that churn had high average balance than the customers that did not churn, which validates the results from Z_Test and T_Test.
Conclusion: We can accept the Alternate Hypothesis (customers with higher average balance in previous quarters are more likely to churn than the customers who have lower average balance).
# same mean-comparison test for the quarter before last
BV_compare_features(data, 'average_monthly_balance_prevQ2', 'churn', 1)
Analysis/Insight gathered from average_monthly_balance_prevQ2 and Churn
Hypothesis: Are customers with a high average monthly balance less likely to churn?
Observation1: P_Val values in both Z_Test and T_Test return 0.03 which is significantly lower than 0.05 which tells us that the alternate hypothesis is accepted.
Observation2: Mean values of the customers that churn had high average balance than the customers that did not churn, which validates the results from Z_Test and T_Test.
Conclusion: We can accept the Alternate Hypothesis (customers with higher average balance in last to last quarter are more likely to churn than the customers who had lower average balance in the same quarters).
# compare mean monthly balances (previous and current month) by churn
BV_compare_features(data, 'previous_month_balance', 'churn', 1)
BV_compare_features(data, 'current_month_balance', 'churn', 1)
Analysis/Insight gathered from average_monthly_balance in current/previous months and Churn
Hypothesis: Are customers with high previous month/current month balance less likely to churn?
Observation1: P_Val values in both Z_Test and T_Test vary significantly between previous month and current month which is also validated by the mean values. Which means that the customers who churned had high balance in previous month and their current month balance dropped
Conclusion: We can use this intuition to perform another test of whether customers whose balances drop significantly are highly likely to churn.
Null Hypothesis: M(drop in balance of churn)<M(drop in balance of not churn)
# FIX: work on an explicit copy — assigning a new column into a slice of
# `data` triggers pandas' SettingWithCopyWarning and becomes a silent
# no-op under copy-on-write (default in pandas 3.0).
difference = data[['churn','previous_month_balance','current_month_balance']].copy()
# month-over-month balance change (negative = balance dropped)
difference['bal_diff'] = difference['current_month_balance']-difference['previous_month_balance']
BV_compare_features(difference, 'bal_diff', 'churn', 1)
Analysis/Insight gathered from customers' dropping balances and Churn
Hypothesis: Are customers with high drop in the balances between previous month and current month less likely to churn?
Observation1: P_Val values in both Z_Test and T_Test are significantly lower than 0.05, and the average current month balance dropped significantly, which explains that the customers with a significant drop are very highly likely to churn.
Conclusion: Customers who have a very high drop in the balances are more likely to churn.
Missing Values - finding behaviour
Gender:
Dependents:
Occupation:
O: Observed Value <br>
E: Expected Value <br>
$$ \tilde{\chi}^2=\frac{1}{d}\sum_{k=1}^{n} \frac{(O_k - E_k)^2}{E_k} $$
from scipy.stats import chi2_contingency
def BV_Compare_Feature_Categorical(data, compare, category):
    """Chi-squared test of independence between two categorical columns,
    visualised as a grouped countplot and a stacked proportion barplot.

    Parameters
    ----------
    data : pd.DataFrame
    compare : str
        Categorical feature under investigation.
    category : str
        Target categorical column (e.g. 'churn').
    """
    temp_data = data[[compare, category]].copy()
    temp_data_cross_tab = pd.crosstab(temp_data[category], temp_data[compare])
    # FIX: pass the full contingency table. The original rebuilt an array
    # from iloc[0]/iloc[1] only, silently dropping any category beyond the
    # first two levels (identical results for a binary `category`).
    chi, p, dof, expected = chi2_contingency(temp_data_cross_tab.values)
    # significant at the conventional 5% level?
    sig = p < 0.05
    # FIX: removed a stray plt.figure() call that produced an empty extra
    # figure before plt.subplots created its own, and the unused
    # `int_level` local.
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))
    g = sns.countplot(x=compare, hue=category, data=temp_data, ax=axes[0])
    g.set_title("chi2-value: {};\np-value = {}\n difference significant? = {}\n".format(round(chi, 8), round(p, 8), sig))
    # stacked proportions of `category` within each level of `compare`
    ax1 = data.groupby(compare)[category].value_counts(normalize=True).unstack(level=-1)
    g = ax1.plot(kind='bar', stacked=True, title=str(ax1), ax=axes[1])
    plt.show()
# chi-squared independence test: gender vs churn
BV_Compare_Feature_Categorical(data, 'gender', 'churn')
Analysis/Insight:
the difference between the males and females customer churning is significant. Churn and Gender are dependent variables
# segregating customers into age segments
churn = data[['churn','age']].copy()
churn['age_group'] = 'str'
# FIX: use .loc instead of chained indexing — chained assignment raises
# SettingWithCopyWarning and is a silent no-op under pandas copy-on-write
# (opt-in in 2.x, default in 3.0).
churn.loc[churn['age'] >= 80, 'age_group'] = 'very old'
churn.loc[(churn['age'] < 80) & (churn['age'] >= 60), 'age_group'] = 'senior citizen'
churn.loc[(churn['age'] < 60) & (churn['age'] >= 18), 'age_group'] = 'adult'
churn.loc[churn['age'] < 18, 'age_group'] = 'young'
BV_Compare_Feature_Categorical(churn, 'age_group', 'churn')
Analysis/Insight: Age group has significant effect on the churning rate. Age and Churn are dependent Variables
# chi-squared independence test: customer net-worth category vs churn
BV_Compare_Feature_Categorical(data, 'customer_nw_category', 'churn')
Analysis/Insight: Different income brackets have significant effect on the churn rate. Customer Net-Worth and Churn are dependent Variables
# segregating dependents into categories
dependents = data[['churn','dependents']].copy()
# FIX: the original called dependents.dropna() and discarded the result
# (dropna is not in-place); assign it so the intent takes effect. In
# practice missing dependents were already filled with -1 earlier, so the
# -1 rows fall into no bucket below, stay None, and are excluded by the
# crosstab inside BV_Compare_Feature_Categorical.
dependents = dependents.dropna()
dependents['dep_group'] = None
# .loc instead of chained assignment (warns / no-op under copy-on-write)
dependents.loc[dependents['dependents'] == 0, 'dep_group'] = 'single'
dependents.loc[(dependents['dependents'] >= 1) & (dependents['dependents'] <= 3), 'dep_group'] = 'small family'
dependents.loc[(dependents['dependents'] >= 4) & (dependents['dependents'] <= 9), 'dep_group'] = 'large family'
dependents.loc[dependents['dependents'] >= 10, 'dep_group'] = 'joint family'
BV_Compare_Feature_Categorical(dependents, 'dep_group', 'churn')
Analysis/Insight: Number of Dependents of customers and Churn are dependent variables
# segregating customers whose last transaction was more than 6 months ago
# FIX: work on a copy — assigning a transformed column into a slice of
# `data` triggers pandas' SettingWithCopyWarning.
transaction = data[['churn','moy_ls_tran']].copy()
# NOTE(review): month-of-year < 7 (Jan–Jun) is treated as "more than 6
# months ago", which assumes the snapshot is taken around year end — confirm.
transaction['moy_ls_tran'] = transaction['moy_ls_tran'].map(lambda x: 'more than 6 months ago' if x<7 else 'within 6 months')
BV_Compare_Feature_Categorical(transaction, 'moy_ls_tran','churn')
# getting city codes which have less than 280 (1%) of accounts
total_accounts=data.shape[0]
tmp = data['city'].value_counts()
cities = tmp[tmp<total_accounts*.01].index
# two segments: accounts in rare cities vs. common cities
churn_acc = data[['churn','city']].copy()
churn_acc['city_cat'] = None
# FIX: .loc instead of chained assignment, which raises
# SettingWithCopyWarning and is a silent no-op under pandas copy-on-write.
churn_acc.loc[churn_acc['city'].isin(cities), 'city_cat'] = 'low accounts'
churn_acc.loc[~churn_acc['city'].isin(cities), 'city_cat'] = 'high accounts'
BV_Compare_Feature_Categorical(churn_acc, 'city_cat','churn' )
Analysis/Insight: Cities contributing less than 1% of the data have a different churn rate.
# getting branch codes with more than 1% of total accounts
total_accounts=data.shape[0]
tmp = data['branch_code'].value_counts()
branch = tmp[tmp<total_accounts*.01].index
# making two segments: rare branches vs. popular branches
churn_acc = data[['churn','branch_code']].copy()
churn_acc['branch_cat'] = None
# FIX: .loc instead of chained assignment (warns / no-op under copy-on-write)
churn_acc.loc[churn_acc['branch_code'].isin(branch), 'branch_cat'] = 'low accounts'
churn_acc.loc[~churn_acc['branch_code'].isin(branch), 'branch_cat'] = 'high accounts'
BV_Compare_Feature_Categorical(churn_acc, 'branch_cat','churn' )
# isolating rows with missing gender
miss_gender = data[['gender','churn']].copy()
miss_gender['missing_gender'] = 'not_missing'
# FIX: .loc instead of chained assignment (warns / no-op under copy-on-write)
miss_gender.loc[~miss_gender['gender'].isin(['Male','Female']), 'missing_gender'] = 'missing value'
#data_test=pd.read_csv("/Users/vipulbhatia29/JupyterAnalytics/storedata/test_dataset.csv")
#data_test.head()
#BV_Compare_Feature_Categorical(data_test, 'Tenure','Exited')
BV_Compare_Feature_Categorical(miss_gender, 'missing_gender', 'churn')
Analysis/Insight: Missing Gender has no effect on the customer behavior with respect to churn
# isolating rows with missing dependents (encoded earlier as -1)
miss_dependents = data[['dependents','churn']].copy()
miss_dependents['missing_dependents']=None
# FIX: .loc instead of chained assignment (warns / no-op under copy-on-write)
miss_dependents.loc[miss_dependents['dependents'] > -1, 'missing_dependents'] = 'not_missing'
miss_dependents.loc[miss_dependents['dependents'] == -1, 'missing_dependents'] = 'missing value'
BV_Compare_Feature_Categorical(miss_dependents, 'missing_dependents', 'churn')
Analysis/Insight: Missing values in dependents have significantly different churning rate than the rest of the data.
# isolating rows with missing occupation
miss_occupation = data[['occupation','churn']].copy()
miss_occupation['missing_occupation']=None
# FIX: .loc instead of chained assignment (warns / no-op under copy-on-write)
miss_occupation.loc[miss_occupation['occupation'].isnull(), 'missing_occupation'] = 'missing_values'
miss_occupation.loc[~(miss_occupation['occupation'].isnull()), 'missing_occupation'] = 'not_missing'
BV_Compare_Feature_Categorical(miss_occupation, 'missing_occupation', 'churn')
Analysis/Insight: Missing values in occupation does not have any significantly different relation with churn rate.
# class-balance check on the churn target (churners are the minority class)
data['churn'].value_counts(normalize=True)
Objective: Identify the Churning Customers' Profile using multiple categorical features
data.dtypes[data.dtypes=='category']
# one colormap per customer_nw_category column in the styled pivot tables
# NOTE(review): 'Yellows' is not a valid matplotlib colormap name, but only
# cmap[0]..cmap[4] are ever used, so it never triggers an error.
cmap=['Blues','Reds','Greens','Oranges','Purples','Yellows']
# churn COUNTS by gender/occupation (rows) x net-worth category (columns)
table1=data.pivot_table(columns='customer_nw_category',values='churn',index=['gender','occupation'],aggfunc=np.sum)
table1.style.background_gradient(cmap=cmap[0],subset=[1])\
.background_gradient(cmap=cmap[1],subset=[2])\
.background_gradient(cmap=cmap[2],subset=[3])
# churn RATE (mean of the 0/1 flag, in percent) for the same breakdown
table1 = data.pivot_table(columns='customer_nw_category',values='churn',index=['gender','occupation'],aggfunc=np.mean)*100
table1.style.background_gradient(cmap=cmap[0],subset=[1])\
.background_gradient(cmap=cmap[1],subset=[2])\
.background_gradient(cmap=cmap[2],subset=[3]).format("{:.2f}")
Analysis/Insight Gathered:
Let's use one continuous variable Age, and two categorical variables, Gender, and Occupation to derive insights related to profiles of customers who are churning. We will have to convert continuous variable into categorical variable for efficiently using Pivot Table. Here we are binning age into three intervals
# bin age into three intervals so it can be used as a pivot-table index
age = pd.cut(data['age'], [0, 25, 50, 100])
# churn COUNTS by (gender, age-bin) rows x occupation columns
table1=data.pivot_table('churn', ['gender', age], 'occupation', aggfunc=np.sum)
cols=table1.columns
table1.style.background_gradient(cmap=cmap[0],subset=[cols[0]])\
.background_gradient(cmap=cmap[1],subset=[cols[1]])\
.background_gradient(cmap=cmap[2],subset=[cols[2]])\
.background_gradient(cmap=cmap[3],subset=[cols[3]])\
.background_gradient(cmap=cmap[4],subset=[cols[4]]).format("{:.0f}")
# churn RATE (%) for the same breakdown
age = pd.cut(data['age'], [0, 25, 50, 100])
table1=data.pivot_table('churn', ['gender', age], 'occupation', aggfunc=np.mean)*100
cols=table1.columns
table1.style.background_gradient(cmap=cmap[0],subset=[cols[0]])\
.background_gradient(cmap=cmap[1],subset=[cols[1]])\
.background_gradient(cmap=cmap[2],subset=[cols[2]])\
.background_gradient(cmap=cmap[3],subset=[cols[3]])\
.background_gradient(cmap=cmap[4],subset=[cols[4]]).format("{:.2f}")
Let's dig deeper by using two continuous variables Age and Current Balance and Two Categorical Variable Gender and Occupation and try to find out the insights related to churning customers profile
# tercile-bin current balance and cross it with occupation in the columns
balance = pd.qcut(data['current_balance'], 3)
# churn COUNTS by (gender, age-bin) x (balance-tercile, occupation)
table1=data.pivot_table('churn', ['gender', age], [balance, 'occupation'], aggfunc='sum')
table1.style.background_gradient(cmap=cmap[4]).format("{:.0f}")
# churn RATE (%) for the same breakdown
balance = pd.qcut(data['current_balance'], 3)
table1=data.pivot_table('churn', ['gender', age], [balance, 'occupation'], aggfunc=np.mean)*100
table1.style.background_gradient(cmap=cmap[2]).format("{:.2f}")
def Grouped_Box_Plot(data, cont, cat1, cat2):
    """Vertical boxplot of the continuous column `cont`, grouped along the
    x-axis by `cat1` and colour-split by `cat2`.
    """
    plt.figure(figsize=(15, 6))
    # seaborn handles the grouping; hue gives one box per cat2 level
    sns.boxplot(data=data, x=cat1, y=cont, hue=cat2, orient='v')
    plt.title('Boxplot')
# age distribution per occupation, split by churn
Grouped_Box_Plot(data,'age', 'occupation', 'churn')
We can notice here that
# vintage distribution per gender, split by churn
Grouped_Box_Plot(data,'vintage','gender', 'churn')